{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import codecs\n",
    "import csv\n",
    "import time\n",
    "import os\n",
    "import re\n",
    "import gzip\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "PATH_TO_DATA = 'Data/'\n",
    "PATH_TO_DATA_EN = PATH_TO_DATA+\"enwiki/\"\n",
    "PATH_TO_DATA_UK = PATH_TO_DATA+\"ukwiki/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "ENWIKI_ART_FNMS = []\n",
    "for file in os.listdir(PATH_TO_DATA_EN):\n",
    "    if re.match(r\"enwiki-20180620-pages-meta-current\\d{2}-p\\d+p\\d+.xml_art.csv.gz\", file):\n",
    "        ENWIKI_ART_FNMS.append(file)   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def unpack(file_name):\n",
    "    file_name_new = file_name.replace(\".gz\",\"\")\n",
    "    with gzip.open(file_name, 'rb') as f_in, open(file_name_new, 'wb') as f_out:\n",
    "        f_out.writelines(f_in)\n",
    "    return file_name_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pack_and_remove(file_name):\n",
    "    file_name_new = file_name+'.gz'\n",
    "    with open(file_name, 'rb') as f_in, gzip.open(file_name_new, 'wb') as f_out:\n",
    "        f_out.writelines(f_in)\n",
    "    os.remove(file_name)\n",
    "    return file_name_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data/enwiki/enwiki-20180620-pages-meta-current01-p10p30303.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current02-p30304p88444.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current03-p88445p200507.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current04-p200511p352689.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current05-p352690p565313.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current06-p565314p892912.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current07-p892914p1268691.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current08-p1268692p1791079.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current09-p1791080p2336422.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current10-p2336425p3046511.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current11-p3046514p3926861.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current12-p3926863p5040436.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current13-p5040438p6197594.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current14-p6197598p7697598.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current14-p7697598p7744799.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current15-p7744803p9244803.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current15-p9244803p9518048.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current16-p11018050p11539266.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current16-p9518050p11018050.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current17-p11539268p13039268.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current17-p13039268p13693071.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current18-p13693074p15193074.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current18-p15193074p16120542.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current19-p16120543p17620543.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current19-p17620543p18754735.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current20-p18754736p20254736.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current20-p20254736p21222156.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current21-p21222158p22722158.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current21-p22722158p23927983.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current22-p23927984p25427984.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current22-p25427984p26823660.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current23-p26823661p28323661.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current23-p28323661p29823661.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current23-p29823661p30503449.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current24-p30503451p32003451.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current24-p32003451p33503451.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current24-p33503451p33952815.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current25-p33952816p35452816.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current25-p35452816p36952816.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current25-p36952816p38067202.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current26-p38067203p39567203.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current26-p39567203p41067203.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current26-p41067203p42567203.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current26-p42567203p42663461.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p42663462p44163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p44163462p45663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p45663462p47163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p47163462p48663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p48663462p50163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p50163462p51663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p51663462p53163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p53163462p54663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p54663462p56163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p56163462p57663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p57663462p57726175.xml_art.csv.gz\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'enwiki-20180620-pages-links.csv.gz'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first = True\n",
    "for fn in ENWIKI_ART_FNMS:\n",
    "    fn = PATH_TO_DATA_EN+fn\n",
    "    print(fn)\n",
    "    fn_new = unpack(fn)\n",
    "    df_articles = pd.read_csv(fn_new, encoding='ISO-8859-1', quotechar=\"'\")\n",
    "    \n",
    "    if first:\n",
    "        df_articles.to_csv('enwiki-20180620-pages-links.csv', index = False, encoding='ISO-8859-1', quotechar=\"'\", escapechar =\"\\\\\")\n",
    "        first = False\n",
    "    else:\n",
    "        df_articles.to_csv('enwiki-20180620-pages-links.csv', mode='a', header=False, index = False, \n",
    "                           encoding='ISO-8859-1', quotechar=\"'\", escapechar =\"\\\\\")\n",
    "        \n",
    "pack_and_remove('enwiki-20180620-pages-links.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "UKWIKI_ART_FNMS = []\n",
    "for file in os.listdir(PATH_TO_DATA_UK):\n",
    "    if re.match(r\"ukwiki-20180620-pages-meta-current\\d{2}-p\\d+p\\d+.xml_art.csv.gz\", file):\n",
    "        UKWIKI_ART_FNMS.append(file) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data/ukwiki/ukwiki-20180620-pages-meta-current01-p1p5503930.xml_art.csv.gz\n",
      "Data/ukwiki/ukwiki-20180620-pages-meta-current02-p5503931p11007860.xml_art.csv.gz\n",
      "Data/ukwiki/ukwiki-20180620-pages-meta-current03-p11007861p16511790.xml_art.csv.gz\n",
      "Data/ukwiki/ukwiki-20180620-pages-meta-current04-p16511791p22015720.xml_art.csv.gz\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'ukwiki-20180620-pages-links.csv.gz'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first = True\n",
    "for fn in UKWIKI_ART_FNMS:\n",
    "    fn = PATH_TO_DATA_UK+fn\n",
    "    print(fn)\n",
    "    fn_new = unpack(fn)\n",
    "    df_articles = pd.read_csv(fn_new, encoding='UTF-8', quotechar=\"\\\"\")\n",
    "\n",
    "    if first:\n",
    "        df_articles.to_csv('ukwiki-20180620-pages-links.csv', index = False, encoding='UTF-8', quotechar=\"\\\"\")\n",
    "        first = False\n",
    "    else:\n",
    "        df_articles.to_csv('ukwiki-20180620-pages-links.csv', mode='a', header=False, index = False, \n",
    "                           encoding='UTF-8', quotechar=\"\\\"\")\n",
    "\n",
    "pack_and_remove('ukwiki-20180620-pages-links.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
